This analysis uses data from a Kaggle competition in which competitors seek to predict housing prices in King County, WA, USA (https://bit.ly/2lRv48E). The data set provides fields including the number of bedrooms, the number of bathrooms, the number of floors, the square footage of the interior living space, and the square footage of the overall lot, among others. The data set covers the years 2014 through 2015. This analysis will apply a number of machine learning models and techniques in order to achieve the best possible model, measured here by MAPE, MAE, RMSE, and R-squared, with emphasis on the MAPE score.
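For reference, these metrics are computed by small project helpers (rmse, mae, mape, custom_rsq) defined in an earlier utilities section of this analysis. A minimal sketch, assuming they follow the standard formulas:
# Sketch of the metric helpers (assumed standard definitions)
rmse <- function(real, predicted) sqrt(mean((real - predicted)^2))
mae  <- function(real, predicted) mean(abs(real - predicted))
mape <- function(real, predicted) mean(abs((real - predicted) / real))
custom_rsq <- function(real, predicted) 1 - sum((real - predicted)^2) / sum((real - mean(real))^2)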
raw_train_df <- fread('Data/house_price_train.csv', stringsAsFactors = F)
raw_test_df <- fread('Data/house_price_test.csv', stringsAsFactors = F)
str(raw_train_df)
## Classes 'data.table' and 'data.frame': 17277 obs. of 21 variables:
## $ id :integer64 9183703376 464000600 2224079050 6163901283 6392003810 7974200948 2426059124 2115510300 ...
## $ date : chr "5/13/2014" "8/27/2014" "7/18/2014" "1/30/2015" ...
## $ price : num 225000 641250 810000 330000 530000 ...
## $ bedrooms : int 3 3 4 4 4 4 4 3 4 3 ...
## $ bathrooms : num 1.5 2.5 3.5 1.5 1.75 3.5 3.25 2.25 2.5 1.5 ...
## $ sqft_living : int 1250 2220 3980 1890 1814 3120 4160 1440 2250 2540 ...
## $ sqft_lot : int 7500 2550 209523 7540 5000 5086 47480 10500 6840 9520 ...
## $ floors : num 1 3 2 1 1 2 2 1 2 1 ...
## $ waterfront : int 0 0 0 0 0 0 0 0 0 0 ...
## $ view : int 0 2 2 0 0 0 0 0 0 0 ...
## $ condition : int 3 3 3 4 4 3 3 3 3 3 ...
## $ grade : int 7 10 9 7 7 9 10 8 9 8 ...
## $ sqft_above : int 1250 2220 3980 1890 944 2480 4160 1130 2250 1500 ...
## $ sqft_basement: int 0 0 0 0 870 640 0 310 0 1040 ...
## $ yr_built : int 1967 1990 2006 1967 1951 2008 1995 1983 1987 1959 ...
## $ yr_renovated : int 0 0 0 0 0 0 0 0 0 0 ...
## $ zipcode : int 98030 98117 98024 98155 98115 98115 98072 98023 98058 98115 ...
## $ lat : num 47.4 47.7 47.6 47.8 47.7 ...
## $ long : num -122 -122 -122 -122 -122 ...
## $ sqft_living15: int 1260 2200 2220 1890 1290 1880 3400 1510 2480 1870 ...
## $ sqft_lot15 : int 7563 5610 65775 8515 5000 5092 40428 8125 7386 6800 ...
## - attr(*, ".internal.selfref")=<externalptr>
summary(raw_train_df)
## id date price
## Min. : 1000102 Length:17277 Min. : 78000
## 1st Qu.:2113701080 Class :character 1st Qu.: 320000
## Median :3902100205 Mode :character Median : 450000
## Mean :4566440237 Mean : 539865
## 3rd Qu.:7302900090 3rd Qu.: 645500
## Max. :9900000190 Max. :7700000
## bedrooms bathrooms sqft_living sqft_lot
## Min. : 1.000 Min. :0.500 Min. : 370 Min. : 520
## 1st Qu.: 3.000 1st Qu.:1.750 1st Qu.: 1430 1st Qu.: 5050
## Median : 3.000 Median :2.250 Median : 1910 Median : 7620
## Mean : 3.369 Mean :2.114 Mean : 2080 Mean : 15186
## 3rd Qu.: 4.000 3rd Qu.:2.500 3rd Qu.: 2550 3rd Qu.: 10695
## Max. :33.000 Max. :8.000 Max. :13540 Max. :1164794
## floors waterfront view condition
## Min. :1.000 Min. :0.000000 Min. :0.0000 Min. :1.000
## 1st Qu.:1.000 1st Qu.:0.000000 1st Qu.:0.0000 1st Qu.:3.000
## Median :1.500 Median :0.000000 Median :0.0000 Median :3.000
## Mean :1.493 Mean :0.007467 Mean :0.2335 Mean :3.413
## 3rd Qu.:2.000 3rd Qu.:0.000000 3rd Qu.:0.0000 3rd Qu.:4.000
## Max. :3.500 Max. :1.000000 Max. :4.0000 Max. :5.000
## grade sqft_above sqft_basement yr_built
## Min. : 3.00 Min. : 370 Min. : 0.0 Min. :1900
## 1st Qu.: 7.00 1st Qu.:1190 1st Qu.: 0.0 1st Qu.:1951
## Median : 7.00 Median :1564 Median : 0.0 Median :1975
## Mean : 7.66 Mean :1791 Mean : 289.4 Mean :1971
## 3rd Qu.: 8.00 3rd Qu.:2210 3rd Qu.: 556.0 3rd Qu.:1997
## Max. :13.00 Max. :9410 Max. :4820.0 Max. :2015
## yr_renovated zipcode lat long
## Min. : 0.00 Min. :98001 Min. :47.16 Min. :-122.5
## 1st Qu.: 0.00 1st Qu.:98033 1st Qu.:47.47 1st Qu.:-122.3
## Median : 0.00 Median :98065 Median :47.57 Median :-122.2
## Mean : 85.35 Mean :98078 Mean :47.56 Mean :-122.2
## 3rd Qu.: 0.00 3rd Qu.:98117 3rd Qu.:47.68 3rd Qu.:-122.1
## Max. :2015.00 Max. :98199 Max. :47.78 Max. :-121.3
## sqft_living15 sqft_lot15
## Min. : 460 Min. : 659
## 1st Qu.:1490 1st Qu.: 5100
## Median :1840 Median : 7639
## Mean :1986 Mean : 12826
## 3rd Qu.:2360 3rd Qu.: 10080
## Max. :6210 Max. :871200
head(raw_train_df)
## id date price bedrooms bathrooms sqft_living sqft_lot
## 1: 9183703376 5/13/2014 225000 3 1.50 1250 7500
## 2: 464000600 8/27/2014 641250 3 2.50 2220 2550
## 3: 2224079050 7/18/2014 810000 4 3.50 3980 209523
## 4: 6163901283 1/30/2015 330000 4 1.50 1890 7540
## 5: 6392003810 5/23/2014 530000 4 1.75 1814 5000
## 6: 7974200948 5/20/2014 953007 4 3.50 3120 5086
## floors waterfront view condition grade sqft_above sqft_basement
## 1: 1 0 0 3 7 1250 0
## 2: 3 0 2 3 10 2220 0
## 3: 2 0 2 3 9 3980 0
## 4: 1 0 0 4 7 1890 0
## 5: 1 0 0 4 7 944 870
## 6: 2 0 0 3 9 2480 640
## yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
## 1: 1967 0 98030 47.3719 -122.215 1260 7563
## 2: 1990 0 98117 47.6963 -122.393 2200 5610
## 3: 2006 0 98024 47.5574 -121.890 2220 65775
## 4: 1967 0 98155 47.7534 -122.318 1890 8515
## 5: 1951 0 98115 47.6840 -122.281 1290 5000
## 6: 2008 0 98115 47.6762 -122.288 1880 5092
#Check for null values
sum(is.na(raw_train_df))
## [1] 0
sum(is.na(raw_test_df))
## [1] 0
As time series modeling will not be utilized in this analysis, the day, month, and year of each purchase will be parsed out into separate fields rather than modeling the raw date field directly.
clean_train_df <- raw_train_df
clean_test_df <- raw_test_df
# Train Data set
clean_train_df$date <- as.Date(raw_train_df$date, "%m/%d/%Y")
clean_train_df$year <- year(clean_train_df$date)
clean_train_df$month <- month(clean_train_df$date)
clean_train_df$day <- day(clean_train_df$date)
clean_train_df$day_of_week <- as.POSIXlt(clean_train_df$date)$wday
# Test Data Set
clean_test_df$date <- as.Date(clean_test_df$date, "%m/%d/%Y")
clean_test_df$year <- year(clean_test_df$date)
clean_test_df$month <- month(clean_test_df$date)
clean_test_df$day <- day(clean_test_df$date)
clean_test_df$day_of_week <- as.POSIXlt(clean_test_df$date)$wday
In order to keep the visualizations readable, a random sample of 1000 houses will be used for all subsequent graphs/charts. Based on the histogram outputs, it appears that at least some of the variables are not normally distributed.
set.seed(12345)
sqft_hist <- c('sqft_living', 'sqft_lot', 'sqft_above', 'sqft_living15', 'sqft_lot15')
stats_hist <- c('bedrooms', 'floors','condition', 'grade')
#Randomly Sample 1000 values
df.1000 <- clean_train_df[sample(nrow(clean_train_df), 1000),]
multiple_hist(df.1000, sqft_hist)
multiple_hist(df.1000, stats_hist)
single_hist(df.1000$yr_built, "Year Built")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
single_hist(df.1000$yr_renovated, "Year Renovated")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
single_hist(df.1000$price, "Price")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
single_hist(df.1000$sqft_basement, "Basement Area")
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
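The histogram helpers above are defined in an earlier section of this analysis; a hypothetical minimal version consistent with the `stat_bin()` messages (which point to a plain geom_histogram() with its default 30 bins), assuming ggplot2 and gridExtra are already loaded:
# Hypothetical sketch of the custom plotting helpers
single_hist <- function(x, title) {
  ggplot(data.frame(value = x), aes(x = value)) +
    geom_histogram() +
    ggtitle(title)
}
multiple_hist <- function(df, vars) {
  # one histogram per column, arranged in a grid
  grid.arrange(grobs = lapply(vars, function(v) single_hist(df[[v]], v)))
}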
The following scatter plots seek to visually describe the relationship between each numerical variable and the target price variable. From this sample, it can be seen that the living area and the number of bathrooms in particular have a strong positive relationship with price (an intuitive observation from a business perspective, as these factors are often explicitly taken into account when pricing a home).
#Create dataframe with only numerical variables
numerical_var <- c('bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'sqft_above', 'sqft_basement', 'yr_built', 'sqft_living15', 'sqft_lot15', 'price')
scatter_df <- clean_train_df[,..numerical_var]
var_list <- names(scatter_df)[1:(length(scatter_df)-1)]
#Create list of ggplots of each numerical variable against price
plot_list <- lapply(var_list, gg_scatter, df = scatter_df)
do.call(grid.arrange, plot_list)
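gg_scatter is another project helper; a plausible minimal version, assuming it simply plots one numerical variable against price (its first argument matches the lapply call above):
# Hypothetical sketch of gg_scatter
gg_scatter <- function(var, df) {
  ggplot(df, aes_string(x = var, y = "price")) +
    geom_point(alpha = 0.3)
}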
The following map visualizes where houses in different price bands (based on quartiles) are located.
#Bin into quartiles for data visualization
df.1000$bin <- factor(Hmisc::cut2(df.1000$price, g = 4), labels = c(1:4))
colorsmap <- colors()[c(490,24,100,657)]
map <- leaflet(data.frame(df.1000)) %>%
addTiles() %>% # Add default OpenStreetMap map tiles
addCircleMarkers(lng=~long, lat=~lat,
popup= paste0("Number of Bedrooms: ", df.1000$bedrooms, sep="\n",
"Number of Bathrooms: ", df.1000$bathrooms, sep="\n",
"Living Room Size: " , df.1000$sqft_living, sep="\n",
"Lot Size: ", df.1000$sqft_lot, sep="\n",
"Number of Floors: ", df.1000$floors, sep="\n",
"Current Condition: ", df.1000$condition),
color= ~colorsmap,
group= unique(df.1000$bin)) #%>%
# This seems to be no longer supported
# addLegend(position = 'bottomright', colors = colorsmap, labels = unique(df.1000$bin))
# addLegend(map, position = 'bottomright', colors = colorsmap, labels = unique(df.1000$bin))
map
The following charts show the distribution of each numerical variable in order to visually identify outliers. In this sample there are clear outliers for all variables other than the number of floors and the year the home was built.
for (var in numerical_var[1:(length(numerical_var)-1)]){
univariate_outlier(clean_test_df, var)
}
## [1] "Outliers: 5"
## [1] "Outliers: 9"
## [1] "Outliers: 93"
## [1] "Outliers: 433"
## [1] "Outliers: 0"
## [1] "Outliers: 76"
## [1] "Outliers: 50"
## [1] "Outliers: 0"
## [1] "Outliers: 67"
## [1] "Outliers: 386"
The following charts look at the distribution of each numerical variable on a monthly and daily basis to help surface any temporal patterns in the outliers. Once again, outliers occur in all variables other than the number of floors and the year built. It is interesting to note that, relatively speaking, a greater proportion of outliers occur on weekdays.
for (var in numerical_var[1:(length(numerical_var)-1)]){
bivariate_outlier(clean_test_df, var)
}
As the above charts demonstrate that outliers are present, values lying outside a 95% distribution band will be clipped (i.e. winsorized at the 2.5th and 97.5th percentiles).
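The clip helper is defined in an earlier section; a sketch consistent with the behavior visible below (e.g. the sqft_lot value of 209523 appears to be pulled down to the upper percentile bound), assuming simple winsorization:
# Sketch of clip: winsorize a numeric vector at the given quantiles
clip <- function(x, lower_bound, upper_bound) {
  q <- quantile(x, probs = c(lower_bound, upper_bound), na.rm = TRUE)
  pmin(pmax(x, q[1]), q[2])
}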
clipped_outliers <- lapply(clean_train_df[,..numerical_var], clip, lower_bound = .025, upper_bound = .975)
clipped_outliers_df <- as.data.table(matrix(unlist(clipped_outliers), nrow=length(unlist(clipped_outliers[1]))))
clean_train_df[,numerical_var] <- clipped_outliers_df
head(clean_train_df)
## id date price bedrooms bathrooms sqft_living sqft_lot
## 1: 9183703376 2014-05-13 225000 3 1.50 1250 7500.0
## 2: 464000600 2014-08-27 641250 3 2.50 2220 2550.0
## 3: 2224079050 2014-07-18 810000 4 3.50 3980 87509.8
## 4: 6163901283 2015-01-30 330000 4 1.50 1890 7540.0
## 5: 6392003810 2014-05-23 530000 4 1.75 1814 5000.0
## 6: 7974200948 2014-05-20 953007 4 3.50 3120 5086.0
## floors waterfront view condition grade sqft_above sqft_basement
## 1: 1 0 0 3 7 1250.0 0
## 2: 3 0 2 3 10 2220.0 0
## 3: 2 0 2 3 9 3840.5 0
## 4: 1 0 0 4 7 1890.0 0
## 5: 1 0 0 4 7 944.0 870
## 6: 2 0 0 3 9 2480.0 640
## yr_built yr_renovated zipcode lat long sqft_living15 sqft_lot15
## 1: 1967 0 98030 47.3719 -122.215 1260 7563
## 2: 1990 0 98117 47.6963 -122.393 2200 5610
## 3: 2006 0 98024 47.5574 -121.890 2220 60548
## 4: 1967 0 98155 47.7534 -122.318 1890 8515
## 5: 1951 0 98115 47.6840 -122.281 1290 5000
## 6: 2008 0 98115 47.6762 -122.288 1880 5092
## year month day day_of_week
## 1: 2014 5 13 2
## 2: 2014 8 27 3
## 3: 2014 7 18 5
## 4: 2015 1 30 5
## 5: 2014 5 23 5
## 6: 2014 5 20 2
The following heatmap visualizes the pairwise correlations between all fields other than the id and the date.
heatmap_data<-clean_train_df[, !c('id','date')]
d3heatmap::d3heatmap(cor(heatmap_data))
This analysis will initially compare the results of lasso linear regression, ranger's implementation of random forest, and XGBoost to determine which algorithm will be used going forward.
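f_partition is a project helper; a minimal sketch, assuming a simple seeded random holdout split (the exact rounding of the test-set size is an assumption):
# Hypothetical sketch of f_partition
f_partition <- function(df, test_proportion = 0.2, seed = NULL) {
  if (!is.null(seed)) set.seed(seed)
  test_idx <- sample(nrow(df), ceiling(nrow(df) * test_proportion))
  list(train = df[-test_idx, ], test = df[test_idx, ])
}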
split_clean_train_df <- f_partition(clean_train_df, test_proportion = 0.2, seed = 123456)
split_clean_train_df$train$date = NULL
split_clean_train_df$test$date = NULL
glmnet_cv<-cv.glmnet(x = data.matrix(split_clean_train_df$train[, !c('id','price')]), nfolds = 5,
y = split_clean_train_df$train[['price']],
alpha=1, family = 'gaussian', standardize = T)
plot(glmnet_cv)
glmnet_cv$lambda.min
## [1] 300.9572
glmnet_0<-glmnet(x = data.matrix(split_clean_train_df$train[, !c('id','price')]),
y = split_clean_train_df$train[['price']],
family = 'gaussian',
alpha=1, lambda = glmnet_cv$lambda.min)
print(glmnet_0)
##
## Call: glmnet(x = data.matrix(split_clean_train_df$train[, !c("id", "price")]), y = split_clean_train_df$train[["price"]], family = "gaussian", alpha = 1, lambda = glmnet_cv$lambda.min)
##
## Df %Dev Lambda
## [1,] 21 0.7467 301
glmnet_0$beta
## 22 x 1 sparse Matrix of class "dgCMatrix"
## s0
## bedrooms -1.809343e+04
## bathrooms 2.520548e+04
## sqft_living 1.157481e+02
## sqft_lot 4.479120e-01
## floors 2.318851e+04
## waterfront 2.048945e+05
## view 4.510006e+04
## condition 2.692461e+04
## grade 9.492991e+04
## sqft_above 1.299709e+00
## sqft_basement -9.948718e+00
## yr_built -2.217149e+03
## yr_renovated 1.958915e+01
## zipcode -4.090499e+02
## lat 5.681984e+05
## long -1.367021e+05
## sqft_living15 4.209482e+01
## sqft_lot15 -9.499088e-01
## year 2.948837e+04
## month .
## day -1.298241e+02
## day_of_week 1.656678e+03
test_glmnet<-predict(glmnet_0, newx = data.matrix(split_clean_train_df$test[,!c('id','price')]))
df_pred<-split_clean_train_df$test[, .(id=1:.N, price, test_glmnet)]
str(df_pred)
## Classes 'data.table' and 'data.frame': 3456 obs. of 3 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : num 810000 953007 495000 1080000 705000 ...
## $ test_glmnet: num 870918 749509 241732 864831 833831 ...
## - attr(*, ".internal.selfref")=<externalptr>
rmse_glmnet<-rmse(real=split_clean_train_df$test$price, predicted = test_glmnet)
mae_glmnet<-mae(real=split_clean_train_df$test$price, predicted = test_glmnet)
mape_glmnet<-mape(real=split_clean_train_df$test$price, predicted = test_glmnet)
mape_glmnet
## [1] 0.2211987
rsq_glmnet<-custom_rsq(real=split_clean_train_df$test$price, predicted = test_glmnet)
rsq_glmnet
## [1] 0.7629821
baseline_rf <- ranger(formula = as.formula(price~.), data=split_clean_train_df$train[,!c('id')], importance = 'impurity')
print(baseline_rf)
## Ranger result
##
## Call:
## ranger(formula = as.formula(price ~ .), data = split_clean_train_df$train[, !c("id")], importance = "impurity")
##
## Type: Regression
## Number of trees: 500
## Sample size: 13821
## Number of independent variables: 22
## Mtry: 4
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 9872336587
## R squared (OOB): 0.8799315
test_rf<-predict(baseline_rf, data = split_clean_train_df$test, type='response')$predictions
df_pred<-cbind(df_pred, test_rf)
str(df_pred)
## Classes 'data.table' and 'data.frame': 3456 obs. of 4 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : num 810000 953007 495000 1080000 705000 ...
## $ test_glmnet: num 870918 749509 241732 864831 833831 ...
## $ test_rf : num 975215 1064661 415067 861272 663111 ...
## - attr(*, ".internal.selfref")=<externalptr>
rmse_rf<-rmse(real=split_clean_train_df$test$price, predicted = test_rf)
mae_rf<-mae(real=split_clean_train_df$test$price, predicted = test_rf)
mape_rf<-mape(real=split_clean_train_df$test$price, predicted = test_rf)
mape_rf
## [1] 0.1255336
rsq_rf<-custom_rsq(real=split_clean_train_df$test$price, predicted = test_rf)
rsq_rf
## [1] 0.893551
xgb_reg_0<-xgboost(booster='gblinear',
data=data.matrix(split_clean_train_df$train[, !c('id','price'), with=F]),
label=split_clean_train_df$train$price,
nrounds = 100,
objective='reg:linear')
## [1] train-rmse:230610.468750
## [2] train-rmse:220846.671875
## [3] train-rmse:215033.453125
## ---
## [99] train-rmse:180964.203125
## [100] train-rmse:180886.375000
print(xgb_reg_0)
## ##### xgb.Booster
## raw: 488 bytes
## call:
## xgb.train(params = params, data = dtrain, nrounds = nrounds,
## watchlist = watchlist, verbose = verbose, print_every_n = print_every_n,
## early_stopping_rounds = early_stopping_rounds, maximize = maximize,
## save_period = save_period, save_name = save_name, xgb_model = xgb_model,
## callbacks = callbacks, booster = "gblinear", objective = "reg:linear")
## params (as set within xgb.train):
## booster = "gblinear", objective = "reg:linear", silent = "1"
## xgb.attributes:
## niter
## callbacks:
## cb.print.evaluation(period = print_every_n)
## cb.evaluation.log()
## # of features: 22
## niter: 100
## nfeatures : 22
## evaluation_log:
## iter train_rmse
## 1 230610.5
## 2 220846.7
## ---
## 99 180964.2
## 100 180886.4
test_xgb<-predict(xgb_reg_0, newdata = data.matrix(split_clean_train_df$test[, !c('id','price'), with=F]),
type='response')
df_pred<-cbind(df_pred, test_xgb)
str(df_pred)
## Classes 'data.table' and 'data.frame': 3456 obs. of 5 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ price : num 810000 953007 495000 1080000 705000 ...
## $ test_glmnet: num 870918 749509 241732 864831 833831 ...
## $ test_rf : num 975215 1064661 415067 861272 663111 ...
## $ test_xgb : num 878559 716272 366077 1020515 695939 ...
## - attr(*, ".internal.selfref")=<externalptr>
rmse_xgb<-rmse(real=split_clean_train_df$test$price, predicted = test_xgb)
mae_xgb<-mae(real=split_clean_train_df$test$price, predicted = test_xgb)
mape_xgb<-mape(real=split_clean_train_df$test$price, predicted = test_xgb)
mape_xgb
## [1] 0.3042548
rsq_xgb<-custom_rsq(real=split_clean_train_df$test$price, predicted = test_xgb)
rsq_xgb
## [1] 0.6131065
As can be seen from the following chart of each algorithm's prediction metrics, the random forest proved superior to the other two and will be used for subsequent feature engineering and tuning.
metrics_plot(df_pred, c('glmnet','rf','xgb_reg'), verbose = T)
## method rmse mae mape rsq
## 1: rf 94173.38 62314.86 0.1255336 0.893551
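metrics_plot is also a project helper; judging from its verbose output above, it scores each prediction column against the true price and reports the best row. A hypothetical sketch (the chart-drawing portion is omitted):
# Hypothetical sketch of metrics_plot
metrics_plot <- function(df_pred, methods, verbose = FALSE) {
  pred_cols <- setdiff(names(df_pred), c('id', 'price'))
  scores <- rbindlist(lapply(seq_along(pred_cols), function(i) {
    p <- df_pred[[pred_cols[i]]]
    data.table(method = methods[i],
               rmse = rmse(df_pred$price, p),
               mae  = mae(df_pred$price, p),
               mape = mape(df_pred$price, p),
               rsq  = custom_rsq(df_pred$price, p))
  }))
  if (verbose) print(scores[which.min(mape)])
  invisible(scores)
}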
A number of features will be created in the hope that their inclusion will improve the model's overall predictive ability. The features are: 1) a weekday/weekend flag, 2) a holiday flag, 3) a renovation flag (set when the 2015 area of either the lot or the living space differs from the original area), 4) a missing-renovation-year flag (as the presence of a renovation year should correspond to a positive renovation flag), and 5) the house age. As the graphs below show, it was found that the inclusion of the first, third, and fourth features improved model performance.
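split_and_train, used repeatedly below, is a project helper as well. Based on the ranger calls echoed in its output, a plausible sketch: it re-partitions with the same seed, fits a baseline ranger model, and appends the holdout predictions to the running comparison table (the real helper presumably also names the new prediction column after the feature set):
# Hypothetical sketch of split_and_train
split_and_train <- function(fe_train_df, df_pred) {
  split_fe_train_df <- f_partition(fe_train_df, test_proportion = 0.2, seed = 123456)
  fe_rf <- ranger(formula = as.formula(price ~ .),
                  data = split_fe_train_df$train[, !c('id')],
                  importance = 'impurity')
  print(fe_rf)
  preds <- predict(fe_rf, data = split_fe_train_df$test, type = 'response')$predictions
  list(split_fe_train_df, fe_rf, cbind(df_pred, preds))
}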
df_pipeline_pred<-split_clean_train_df$test[, .(id=1:.N, price, test_rf)]
colnames(df_pipeline_pred) <-c('id','price','baseline')
fe_train_df1 <- clean_train_df
fe_test_df1 <- clean_test_df
fe_train_df1$weekend <-as.logical(is.weekend(clean_train_df$date))
fe_test_df1$weekend <-as.logical(is.weekend(clean_test_df$date))
fe_train_df1$date = NULL
fe_output_1 <- split_and_train(fe_train_df1, df_pipeline_pred)
## Ranger result
##
## Call:
## ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[, !c("id")], importance = "impurity")
##
## Type: Regression
## Number of trees: 500
## Sample size: 13821
## Number of independent variables: 23
## Mtry: 4
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 9932652150
## R squared (OOB): 0.8791979
metrics_plot(fe_output_1[[3]], c('baseline','fe1'), verbose = T)
## method rmse mae mape rsq
## 1: baseline 94173.38 62314.86 0.1255336 0.893551
fe_train_df2 <- clean_train_df
fe_test_df2 <- clean_test_df
fe_train_df2$holiday <-as.logical(is.holiday(clean_train_df$date))
fe_test_df2$holiday <-as.logical(is.holiday(clean_test_df$date))
fe_train_df2$date = NULL
fe_output_2 <- split_and_train(fe_train_df2, fe_output_1[[3]])
## Ranger result
##
## Call:
## ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[, !c("id")], importance = "impurity")
##
## Type: Regression
## Number of trees: 500
## Sample size: 13821
## Number of independent variables: 23
## Mtry: 4
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 9922857033
## R squared (OOB): 0.879317
metrics_plot(fe_output_2[[3]], c('baseline','fe1','fe2'), verbose = T)
## method rmse mae mape rsq
## 1: baseline 94173.38 62314.86 0.1255336 0.893551
clean_test_df$date = NULL
fe_train_df3 <- clean_train_df
fe_test_df3 <- clean_test_df
fe_train_df3$renovated <- ifelse(((fe_train_df3$sqft_living != fe_train_df3$sqft_living15) |
(fe_train_df3$sqft_lot != fe_train_df3$sqft_lot15)), 1, 0)
fe_test_df3$renovated <- ifelse(((fe_test_df3$sqft_living != fe_test_df3$sqft_living15) |
(fe_test_df3$sqft_lot != fe_test_df3$sqft_lot15)), 1, 0)
fe_output_3 <- split_and_train(fe_train_df3, fe_output_2[[3]])
## Ranger result
##
## Call:
## ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[, !c("id")], importance = "impurity")
##
## Type: Regression
## Number of trees: 500
## Sample size: 13821
## Number of independent variables: 24
## Mtry: 4
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 10101553722
## R squared (OOB): 0.8771437
metrics_plot(fe_output_3[[3]], c('baseline','fe1','fe2','fe3'), verbose = T)
## method rmse mae mape rsq
## 1: baseline 94173.38 62314.86 0.1255336 0.893551
fe_train_df4 <- fe_train_df3
fe_test_df4 <- fe_test_df3
fe_train_df4$missing_ren_year <- ifelse(((fe_train_df4$yr_renovated == 0) & (fe_train_df4$renovated == T)), 1, 0)
fe_test_df4$missing_ren_year <- ifelse(((fe_test_df4$yr_renovated == 0) & (fe_test_df4$renovated == T)), 1, 0)
fe_output_4 <- split_and_train(fe_train_df4, fe_output_3[[3]])
## Ranger result
##
## Call:
## ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[, !c("id")], importance = "impurity")
##
## Type: Regression
## Number of trees: 500
## Sample size: 13821
## Number of independent variables: 25
## Mtry: 5
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 9763103935
## R squared (OOB): 0.88126
metrics_plot(fe_output_4[[3]], c('baseline','fe1','fe2','fe3','fe4'), verbose = T)
## method rmse mae mape rsq
## 1: fe4 93277.24 61544.41 0.1235152 0.8955673
fe_train_df5 <- clean_train_df
fe_test_df5 <- clean_test_df
fe_train_df5$house_age <- year(Sys.Date()) - fe_train_df5$yr_built
fe_test_df5$house_age <- year(Sys.Date()) - fe_test_df5$yr_built
fe_output_5 <- split_and_train(fe_train_df5, fe_output_4[[3]])
## Ranger result
##
## Call:
## ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[, !c("id")], importance = "impurity")
##
## Type: Regression
## Number of trees: 500
## Sample size: 13821
## Number of independent variables: 24
## Mtry: 4
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 10076946986
## R squared (OOB): 0.877443
metrics_plot(fe_output_5[[3]], c('baseline','fe1','fe2','fe3','fe4','fe5'), verbose = T)
## method rmse mae mape rsq
## 1: fe4 93277.24 61544.41 0.1235152 0.8955673
The first, third, and fourth engineered features are now combined to train the next model iteration. This combined model is found to be superior to any of the individual feature models, so these features will be carried into the tuning phase.
fe_train_df_final <- fe_train_df1
fe_test_df_final<- fe_test_df1
fe_train_df_final$renovated <- ifelse(((fe_train_df_final$sqft_living != fe_train_df_final$sqft_living15) |
(fe_train_df_final$sqft_lot != fe_train_df_final$sqft_lot15)), 1, 0)
fe_test_df_final$renovated <- ifelse(((fe_test_df_final$sqft_living != fe_test_df_final$sqft_living15) |
(fe_test_df_final$sqft_lot != fe_test_df_final$sqft_lot15)), 1, 0)
fe_train_df_final$missing_ren_year <- ifelse(((fe_train_df_final$yr_renovated == 0)
& (fe_train_df_final$renovated == T)), 1, 0)
fe_test_df_final$missing_ren_year <- ifelse(((fe_test_df_final$yr_renovated == 0)
& (fe_test_df_final$renovated == T)), 1, 0)
fe_output_final <- split_and_train(fe_train_df_final, fe_output_5[[3]])
## Ranger result
##
## Call:
## ranger(formula = as.formula(price ~ .), data = split_fe_train_df$train[, !c("id")], importance = "impurity")
##
## Type: Regression
## Number of trees: 500
## Sample size: 13821
## Number of independent variables: 25
## Mtry: 5
## Target node size: 5
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 9750571539
## R squared (OOB): 0.8814124
metrics_plot(fe_output_final[[3]], c('baseline','fe1','fe2','fe3','fe4','fe5','final_fe'), verbose = T)
## method rmse mae mape rsq
## 1: fe4 93277.24 61544.41 0.1235152 0.8955673
Per its documentation, the tuneRanger package seeks to find the optimal minimum node size, sample fraction, and mtry for a given ranger model. Rather than using standard cross-validation folds, out-of-bag (OOB) predictions are used for faster tuning. This final tuned model further improves on the previous iterations, with a MAPE on the holdout set of approximately 0.1152.
# An alternative approach using ranger's case-specific random forests (csrf)
# was considered but not pursued:
# test_rf_tuned <- csrf(
# formula = as.formula(price~.),
# training_data = split_clean_train_df$train[,!c('id')],
# test_data = split_clean_train_df$test[,!c('id')],
# params1 = list(importance = 'impurity'),
# params2 = list(num.trees = 50)
# )
final_train_df <- fe_output_final[[1]]
final_test_df <- fe_test_df_final
#Convert the logical weekend flag to integers, as the mlr task below does not support logical/categorical columns
#Train/Test Split
final_train_df$train$weekend <- ifelse((final_train_df$train$weekend), 1, 0)
final_train_df$test$weekend <- ifelse((final_train_df$test$weekend), 1, 0)
#Validation Split
final_test_df$weekend <- ifelse((final_test_df$weekend), 1, 0)
task = makeRegrTask(data = final_train_df$train[,!c('id')], target = "price")
## Warning in makeTask(type = type, data = data, weights = weights, blocking
## = blocking, : Provided data is not a pure data.frame but from class
## data.table, hence it will be converted.
# Estimate runtime
estimateTimeTuneRanger(task)
## Approximated time for tuning: 3H 14M 51S
# Tuning
res = tuneRanger(task, num.trees = 500, num.threads = 2, iters = 50, save.file.path = NULL)
## Computing y column(s) for design. Not provided.
## [mbo] 0: mtry=9; min.node.size=151; sample.fraction=0.432 : y = 1.39e+10 : 6.2 secs : initdesign
## [mbo] 0: mtry=17; min.node.size=113; sample.fraction=0.838 : y = 1.12e+10 : 19.3 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=25; sample.fraction=0.351 : y = 1.33e+10 : 4.5 secs : initdesign
## [mbo] 0: mtry=21; min.node.size=1.16e+03; sample.fraction=0.615 : y = 2.46e+10 : 7.0 secs : initdesign
## [mbo] 0: mtry=10; min.node.size=8; sample.fraction=0.563 : y = 9.17e+09 : 17.4 secs : initdesign
## [mbo] 0: mtry=9; min.node.size=84; sample.fraction=0.644 : y = 1.14e+10 : 11.1 secs : initdesign
## [mbo] 0: mtry=11; min.node.size=1.62e+03; sample.fraction=0.436 : y = 3.24e+10 : 1.8 secs : initdesign
## [mbo] 0: mtry=15; min.node.size=11; sample.fraction=0.278 : y = 9.77e+09 : 11.9 secs : initdesign
## [mbo] 0: mtry=18; min.node.size=6; sample.fraction=0.735 : y = 8.96e+09 : 37.8 secs : initdesign
## [mbo] 0: mtry=2; min.node.size=330; sample.fraction=0.78 : y = 2.15e+10 : 2.8 secs : initdesign
## [mbo] 0: mtry=19; min.node.size=558; sample.fraction=0.461 : y = 1.96e+10 : 11.4 secs : initdesign
## [mbo] 0: mtry=11; min.node.size=5; sample.fraction=0.79 : y = 8.85e+09 : 48.3 secs : initdesign
## [mbo] 0: mtry=3; min.node.size=14; sample.fraction=0.506 : y = 1.18e+10 : 8.9 secs : initdesign
## [mbo] 0: mtry=14; min.node.size=412; sample.fraction=0.738 : y = 1.55e+10 : 12.6 secs : initdesign
## [mbo] 0: mtry=14; min.node.size=2; sample.fraction=0.585 : y = 8.86e+09 : 40.2 secs : initdesign
## [mbo] 0: mtry=6; min.node.size=157; sample.fraction=0.38 : y = 1.58e+10 : 3.7 secs : initdesign
## [mbo] 0: mtry=23; min.node.size=62; sample.fraction=0.541 : y = 1.09e+10 : 31.6 secs : initdesign
## [mbo] 0: mtry=16; min.node.size=198; sample.fraction=0.314 : y = 1.55e+10 : 7.0 secs : initdesign
## [mbo] 0: mtry=20; min.node.size=3; sample.fraction=0.405 : y = 9.13e+09 : 31.6 secs : initdesign
## [mbo] 0: mtry=5; min.node.size=4; sample.fraction=0.229 : y = 1.1e+10 : 5.8 secs : initdesign
## [mbo] 0: mtry=18; min.node.size=41; sample.fraction=0.811 : y = 9.76e+09 : 28.5 secs : initdesign
## [mbo] 0: mtry=4; min.node.size=2; sample.fraction=0.669 : y = 9.98e+09 : 14.4 secs : initdesign
## [mbo] 0: mtry=23; min.node.size=1.89e+03; sample.fraction=0.338 : y = 3.65e+10 : 2.2 secs : initdesign
## [mbo] 0: mtry=7; min.node.size=569; sample.fraction=0.621 : y = 1.93e+10 : 4.2 secs : initdesign
## [mbo] 0: mtry=25; min.node.size=32; sample.fraction=0.206 : y = 1.11e+10 : 12.0 secs : initdesign
## [mbo] 0: mtry=8; min.node.size=2.33e+03; sample.fraction=0.267 : y = 3.76e+10 : 0.9 secs : initdesign
## [mbo] 0: mtry=24; min.node.size=3; sample.fraction=0.857 : y = 9.94e+09 : 72.8 secs : initdesign
## [mbo] 0: mtry=1; min.node.size=20; sample.fraction=0.887 : y = 2.57e+10 : 2.6 secs : initdesign
## [mbo] 0: mtry=22; min.node.size=924; sample.fraction=0.698 : y = 2.14e+10 : 8.1 secs : initdesign
## [mbo] 0: mtry=13; min.node.size=17; sample.fraction=0.486 : y = 9.41e+09 : 14.7 secs : initdesign
## [mbo] 1: mtry=10; min.node.size=3; sample.fraction=0.505 : y = 9.09e+09 : 18.5 secs : infill_cb
## [mbo] 2: mtry=15; min.node.size=2; sample.fraction=0.799 : y = 8.82e+09 : 43.1 secs : infill_cb
## [mbo] 3: mtry=24; min.node.size=2; sample.fraction=0.202 : y = 9.83e+09 : 18.8 secs : infill_cb
## [mbo] 4: mtry=13; min.node.size=3; sample.fraction=0.692 : y = 8.86e+09 : 30.6 secs : infill_cb
## [mbo] 5: mtry=9; min.node.size=2; sample.fraction=0.426 : y = 9.34e+09 : 15.5 secs : infill_cb
## [mbo] 6: mtry=20; min.node.size=4; sample.fraction=0.784 : y = 9.07e+09 : 46.8 secs : infill_cb
## [mbo] 7: mtry=11; min.node.size=2; sample.fraction=0.737 : y = 8.87e+09 : 30.2 secs : infill_cb
## [mbo] 8: mtry=17; min.node.size=6; sample.fraction=0.452 : y = 9.1e+09 : 20.2 secs : infill_cb
## [mbo] 9: mtry=16; min.node.size=2; sample.fraction=0.785 : y = 8.87e+09 : 43.2 secs : infill_cb
## [mbo] 10: mtry=25; min.node.size=2; sample.fraction=0.44 : y = 9.27e+09 : 37.3 secs : infill_cb
## [mbo] 11: mtry=11; min.node.size=5; sample.fraction=0.616 : y = 8.95e+09 : 22.0 secs : infill_cb
## [mbo] 12: mtry=7; min.node.size=2; sample.fraction=0.564 : y = 9.25e+09 : 14.4 secs : infill_cb
## [mbo] 13: mtry=19; min.node.size=2; sample.fraction=0.41 : y = 9.1e+09 : 29.0 secs : infill_cb
## [mbo] 14: mtry=14; min.node.size=5; sample.fraction=0.78 : y = 8.84e+09 : 36.1 secs : infill_cb
## [mbo] 15: mtry=19; min.node.size=13; sample.fraction=0.436 : y = 9.34e+09 : 32.7 secs : infill_cb
## [mbo] 16: mtry=12; min.node.size=2; sample.fraction=0.777 : y = 8.86e+09 : 43.2 secs : infill_cb
## [mbo] 17: mtry=18; min.node.size=2; sample.fraction=0.774 : y = 8.92e+09 : 54.7 secs : infill_cb
## [mbo] 18: mtry=18; min.node.size=35; sample.fraction=0.586 : y = 9.76e+09 : 31.9 secs : infill_cb
## [mbo] 19: mtry=25; min.node.size=14; sample.fraction=0.751 : y = 9.86e+09 : 58.2 secs : infill_cb
## [mbo] 20: mtry=25; min.node.size=5; sample.fraction=0.327 : y = 9.51e+09 : 27.6 secs : infill_cb
## [mbo] 21: mtry=11; min.node.size=2; sample.fraction=0.662 : y = 8.83e+09 : 25.7 secs : infill_cb
## [mbo] 22: mtry=4; min.node.size=2; sample.fraction=0.452 : y = 1.04e+10 : 9.2 secs : infill_cb
## [mbo] 23: mtry=9; min.node.size=4; sample.fraction=0.691 : y = 8.97e+09 : 27.3 secs : infill_cb
## [mbo] 24: mtry=17; min.node.size=6; sample.fraction=0.792 : y = 8.94e+09 : 55.0 secs : infill_cb
## [mbo] 25: mtry=13; min.node.size=2; sample.fraction=0.776 : y = 8.83e+09 : 47.8 secs : infill_cb
## [mbo] 26: mtry=15; min.node.size=2; sample.fraction=0.396 : y = 9.14e+09 : 29.1 secs : infill_cb
## [mbo] 27: mtry=19; min.node.size=74; sample.fraction=0.742 : y = 1.07e+10 : 27.9 secs : infill_cb
## [mbo] 28: mtry=9; min.node.size=7; sample.fraction=0.455 : y = 9.41e+09 : 12.4 secs : infill_cb
## [mbo] 29: mtry=23; min.node.size=3; sample.fraction=0.505 : y = 9.13e+09 : 45.0 secs : infill_cb
## [mbo] 30: mtry=9; min.node.size=2; sample.fraction=0.636 : y = 9e+09 : 35.2 secs : infill_cb
## [mbo] 31: mtry=18; min.node.size=5; sample.fraction=0.799 : y = 8.99e+09 : 56.7 secs : infill_cb
## [mbo] 32: mtry=14; min.node.size=3; sample.fraction=0.747 : y = 8.84e+09 : 39.3 secs : infill_cb
## [mbo] 33: mtry=24; min.node.size=2; sample.fraction=0.675 : y = 9.34e+09 : 83.3 secs : infill_cb
## [mbo] 34: mtry=16; min.node.size=2; sample.fraction=0.557 : y = 8.89e+09 : 40.3 secs : infill_cb
## [mbo] 35: mtry=21; min.node.size=2; sample.fraction=0.52 : y = 9.04e+09 : 45.9 secs : infill_cb
## [mbo] 36: mtry=11; min.node.size=3; sample.fraction=0.771 : y = 8.82e+09 : 31.2 secs : infill_cb
## [mbo] 37: mtry=14; min.node.size=2; sample.fraction=0.69 : y = 8.86e+09 : 80.1 secs : infill_cb
## [mbo] 38: mtry=11; min.node.size=4; sample.fraction=0.723 : y = 8.86e+09 : 39.0 secs : infill_cb
## [mbo] 39: mtry=15; min.node.size=2; sample.fraction=0.5 : y = 8.96e+09 : 36.6 secs : infill_cb
## [mbo] 40: mtry=11; min.node.size=3; sample.fraction=0.669 : y = 8.86e+09 : 40.1 secs : infill_cb
## [mbo] 41: mtry=18; min.node.size=7; sample.fraction=0.527 : y = 9.03e+09 : 44.7 secs : infill_cb
## [mbo] 42: mtry=14; min.node.size=3; sample.fraction=0.788 : y = 8.86e+09 : 47.6 secs : infill_cb
## [mbo] 43: mtry=13; min.node.size=5; sample.fraction=0.745 : y = 8.81e+09 : 48.9 secs : infill_cb
## [mbo] 44: mtry=16; min.node.size=6; sample.fraction=0.682 : y = 8.86e+09 : 44.6 secs : infill_cb
## [mbo] 45: mtry=13; min.node.size=2; sample.fraction=0.223 : y = 9.82e+09 : 17.7 secs : infill_cb
## [mbo] 46: mtry=15; min.node.size=2; sample.fraction=0.766 : y = 8.84e+09 : 45.7 secs : infill_cb
## [mbo] 47: mtry=13; min.node.size=4; sample.fraction=0.761 : y = 8.81e+09 : 45.7 secs : infill_cb
## [mbo] 48: mtry=22; min.node.size=2; sample.fraction=0.428 : y = 9.14e+09 : 41.3 secs : infill_cb
## [mbo] 49: mtry=16; min.node.size=2; sample.fraction=0.66 : y = 8.85e+09 : 38.9 secs : infill_cb
## [mbo] 50: mtry=11; min.node.size=5; sample.fraction=0.757 : y = 8.88e+09 : 44.2 secs : infill_cb
# Model with the new tuned hyperparameters
res$model
## Model for learner.id=regr.ranger; learner.class=regr.ranger
## Trained on: task.id = final_train_df$train[, !c("id")]; obs = 13821; features = 25
## Hyperparameters: num.threads=2,verbose=FALSE,respect.unordered.factors=order,mtry=13,min.node.size=4,sample.fraction=0.769,num.trees=500,replace=FALSE
# Prediction
final <- predict(res$model, newdata = final_train_df$test[,!c('id')])$data$response
## Warning in predict.WrappedModel(res$model, newdata =
## final_train_df$test[, : Provided data for prediction is not a pure
## data.frame but from class data.table, hence it will be converted.
df_pipeline_final<-cbind(fe_output_final[[3]], final)
metrics_plot(df_pipeline_final, c('baseline','fe1','fe2','fe3','fe4','fe5','final_fe','final_model'), verbose = T)
## method rmse mae mape rsq
## 1: final_model 88726.88 57831.07 0.1152482 0.9055079
The following section retrains the model on the entire training set, using the engineered features and optimal hyperparameters from the previous sections. The retrained model is then used to make predictions on the validation set, which are finally written out in the required CSV format.
final_total_train<- rbind(final_train_df$train, final_train_df$test)
#Running this was computationally expensive and ultimately unsuccessful
# fit_control <- trainControl(method = "cv", number = 3, verboseIter = TRUE, search = "random")
# final_rf <- train(as.factor(price) ~ ., data = final_total_train[,!c('id')],
# method = "ranger",
# trControl = fit_control)
final_rf <- ranger(formula = as.formula(price~.), data=final_total_train[,!c('id')],
importance = 'impurity',
mtry = res$recommended.pars$mtry,
min.node.size = res$recommended.pars$min.node.size,
sample.fraction = res$recommended.pars$sample.fraction)
## Growing trees.. Progress: 80%. Estimated remaining time: 7 seconds.
print(final_rf)
## Ranger result
##
## Call:
## ranger(formula = as.formula(price ~ .), data = final_total_train[, !c("id")], importance = "impurity", mtry = res$recommended.pars$mtry, min.node.size = res$recommended.pars$min.node.size, sample.fraction = res$recommended.pars$sample.fraction)
##
## Type: Regression
## Number of trees: 500
## Sample size: 17277
## Number of independent variables: 25
## Mtry: 13
## Target node size: 4
## Variable importance mode: impurity
## Splitrule: variance
## OOB prediction error (MSE): 8543289410
## R squared (OOB): 0.8963732
final_test_rf<-predict(final_rf, data = final_test_df, type='response')$predictions
prediction<-clean_test_df[, .(id=id,final_test_rf)]
head(prediction)
## id final_test_rf
## 1: 6414100192 481312.9
## 2: 6054650070 399438.2
## 3: 16000397 208393.7
## 4: 2524049179 1205436.0
## 5: 8562750320 602436.8
## 6: 7589200193 521044.3
This chart displays the variable importance sorted by node impurity (i.e. the reduction in variance achieved by splits on each variable). Many of the most important factors influencing house prices are intuitive (e.g. living area has a clear positive relationship with price).
importance_df <- data.frame(final_rf$variable.importance)
setDT(importance_df, keep.rownames = TRUE)[]
## rn final_rf.variable.importance
## 1: bedrooms 3.022104e+12
## 2: bathrooms 1.531373e+13
## 3: sqft_living 2.327707e+14
## 4: sqft_lot 1.403515e+13
## 5: floors 2.182092e+12
## 6: waterfront 7.469033e+12
## 7: view 1.943842e+13
## 8: condition 4.159362e+12
## 9: grade 2.998744e+14
## 10: sqft_above 4.252713e+13
## 11: sqft_basement 8.176851e+12
## 12: yr_built 3.602017e+13
## 13: yr_renovated 1.850260e+12
## 14: zipcode 2.312867e+13
## 15: lat 2.295265e+14
## 16: long 4.790245e+13
## 17: sqft_living15 6.870708e+13
## 18: sqft_lot15 1.471390e+13
## 19: year 1.253822e+12
## 20: month 5.634836e+12
## 21: day 7.456911e+12
## 22: day_of_week 3.607965e+12
## 23: weekend 2.289366e+11
## 24: renovated 1.787497e+11
## 25: missing_ren_year 6.487339e+11
## rn final_rf.variable.importance
colnames(importance_df) <- c('variable', 'importance')
ggplot(importance_df, aes(x=reorder(variable,importance), y=importance, fill=importance)) +
geom_bar(stat="identity", position="dodge")+ coord_flip()+
ylab("Variable Importance")+
xlab("")+
ggtitle("Information Value Summary")+
guides(fill=F)+
scale_fill_gradient(low="red", high="blue")
colnames(prediction) <- c('id', 'target')
write.csv(prediction, file = "output.csv", row.names = FALSE)